#########################################################################
# FigureE2.R
#########################################################################

library(tidyverse)

#########################################################################
# 1. Process Data
#########################################################################

# Load the survey export, discard its first two rows, and give every remaining
# row a sequential id.
# NOTE(review): the two dropped rows are presumably extra header/metadata rows
# written by the Qualtrics export -- confirm against the raw file.
survey_export <- read.csv("qualtrics_Alaska_nonnumeric.csv")

dt <- survey_export[-(1:2), ] %>%
  mutate(id = row_number())

#########################################################################
# 2. Forced Ranking
#########################################################################

# Candidates in item order: C4_1 maps to the first entry, C4_2 to the second,
# and so on. The C3_* items later in this file use the same mapping.
ref_set <- c("Begich", "Bye", "Palin", "Peltola")

# Reshape the forced-ranking question (C4) to one row per id x candidate.
# C4_DO is split on "|" into C4_pos1..C4_pos4, which are carried along as
# extra columns; C4_DO itself is dropped before pivoting.
# NOTE(review): C4_DO presumably records the on-screen order in which the
# candidates were displayed -- confirm against the survey codebook.
c4_items <- paste0("C4_", seq_along(ref_set))

dtc <- dt %>%
  separate(C4_DO,
           into = paste0("C4_pos", 1:4),
           sep = "[|]",          # literal "|" separator
           remove = FALSE) %>%
  dplyr::select(id, starts_with("C4"), -c("C4_DO")) %>%
  pivot_longer(cols = C4_1:C4_4, values_to = "rank") %>%
  dplyr::select(id, name, rank, everything()) %>%
  mutate(
    # Translate the item name (C4_1..C4_4) into the candidate name; anything
    # that is not one of the four items becomes NA.
    name = ref_set[match(name, c4_items)],
    rank = as.numeric(rank)
  )

# First-choice share per candidate for the forced-ranking question:
#   percentage = (rows with rank == 1) / (all rows for that candidate) * 100
# NOTE(review): the denominator counts every row for the candidate in dtc,
# including rows whose rank is NA, whereas the confidence intervals computed
# later in this file use prop.table() over the rank == 1 rows only. If any
# rank is missing, the point estimate and its interval are on different
# bases -- confirm this is intended.

# Rows where the candidate was ranked first (also reused for the CI step).
filtered_data <- filter(dtc, rank == 1)

merged_data <- filtered_data %>%
  count(name, name = "count") %>%
  left_join(count(dtc, name, name = "total_count"), by = "name") %>%
  mutate(percentage = count / total_count * 100)

print(merged_data)

################################################################################
### Estimate the voter share and compute SE

# Share of first-place rankings received by each candidate (computed over the
# rank == 1 rows only) with a normal-approximation standard error.
name_counts <- table(filtered_data$name)
name_proportions <- prop.table(name_counts)
name_se <- sqrt(name_proportions * (1 - name_proportions) / sum(name_counts))

# Two-sided 90% confidence interval: qnorm(0.95) leaves 5% in each tail.
z_score_90 <- qnorm(0.95)

ci_lower <- name_proportions - z_score_90 * name_se
ci_upper <- name_proportions + z_score_90 * name_se

# Every input below except `name` is a `table`, so data.frame() expands each
# one into a `<arg>.Var1` / `<arg>.Freq` column pair. The numeric values live
# in the `.Freq` columns, which is why those names are used here and by the
# code that later combines the result tables.
result_table_full <- data.frame(
  name = names(name_proportions),
  Proportion = name_proportions,
  Standard_Error = name_se,
  CI_Lower = ci_lower,
  CI_Upper = ci_upper
)

# Express the interval bounds in percent (the proportion and SE columns stay
# on the 0-1 scale).
result_table_full$CI_Lower.Freq <- result_table_full$CI_Lower.Freq * 100
result_table_full$CI_Upper.Freq <- result_table_full$CI_Upper.Freq * 100

result_table_full$dataset <- "Forced-Ranking Question"


#########################################################################
# 3. Optional Ranking
#########################################################################

# Reshape the optional-ranking question (C3) to one row per id x candidate.
# C3_DO is split on "|" into C3_pos1..C3_pos4, which are carried along as
# extra columns; C3_DO itself is dropped before pivoting.
# NOTE(review): C3_DO presumably records the on-screen order in which the
# candidates were displayed -- confirm against the survey codebook.
c3_items <- paste0("C3_", seq_along(ref_set))

dtc <- dt %>%
  separate(C3_DO,
           into = paste0("C3_pos", 1:4),
           sep = "[|]",          # literal "|" separator
           remove = FALSE) %>%
  dplyr::select(id, starts_with("C3"), -c("C3_DO")) %>%
  pivot_longer(cols = C3_1:C3_4, values_to = "rank") %>%
  mutate(
    # Translate the item name (C3_1..C3_4) into the candidate name; anything
    # that is not one of the four items becomes NA.
    name = ref_set[match(name, c3_items)],
    rank = as.numeric(rank),
    # 1 when the candidate received any rank, 0 when the rank is missing.
    selection = as.numeric(!is.na(rank))
  ) %>%
  dplyr::select(id, name, selection, rank, everything())


# First-choice share per candidate for the optional-ranking question:
#   percentage = (rows with rank == 1) / (all rows for that candidate) * 100
# NOTE(review): the denominator counts every row for the candidate in dtc,
# including rows whose rank is NA (candidate left unranked), whereas the
# confidence intervals computed later in this file use prop.table() over the
# rank == 1 rows only. If any rank is missing, the point estimate and its
# interval are on different bases -- confirm this is intended.

# Rows where the candidate was ranked first (also reused for the CI step).
filtered_data <- filter(dtc, rank == 1)

merged_data1 <- filtered_data %>%
  count(name, name = "count") %>%
  left_join(count(dtc, name, name = "total_count"), by = "name") %>%
  mutate(percentage = count / total_count * 100)

print(merged_data1)


################################################################################
### Estimate the voter share and compute SE

# Share of first-place rankings received by each candidate (computed over the
# rank == 1 rows only) with a normal-approximation standard error.
name_counts <- table(filtered_data$name)
name_proportions <- prop.table(name_counts)
name_se <- sqrt(name_proportions * (1 - name_proportions) / sum(name_counts))

# Two-sided 90% confidence interval: qnorm(0.95) leaves 5% in each tail.
z_score_90 <- qnorm(0.95)

ci_lower <- name_proportions - z_score_90 * name_se
ci_upper <- name_proportions + z_score_90 * name_se

# Every input below except `name` is a `table`, so data.frame() expands each
# one into a `<arg>.Var1` / `<arg>.Freq` column pair. The numeric values live
# in the `.Freq` columns, which is why those names are used here and by the
# code that later combines the result tables.
result_table_par <- data.frame(
  name = names(name_proportions),
  Proportion = name_proportions,
  Standard_Error = name_se,
  CI_Lower = ci_lower,
  CI_Upper = ci_upper
)

# Express the interval bounds in percent (the proportion and SE columns stay
# on the 0-1 scale).
result_table_par$CI_Lower.Freq <- result_table_par$CI_Lower.Freq * 100
result_table_par$CI_Upper.Freq <- result_table_par$CI_Upper.Freq * 100

result_table_par$dataset <- "Optional-Ranking Question"


#########################################################################
# 4. Official Records
#########################################################################

# Stack the survey-based first-choice shares from both question formats,
# labelling each row with the question it came from.
house_combined_data <- bind_rows(
  mutate(merged_data, dataset = "Forced-Ranking Question"),
  mutate(merged_data1, dataset = "Optional-Ranking Question")
)
# No factor ordering is applied to `name` here: binding the character `name`
# column of the official results below would turn a factor back into
# character anyway, and the plotting order is set later with fct_reorder().

# Official vote share (in percent) for each candidate.
vote_share_mapping <- c("Peltola" = 48.7, "Palin" = 25.8,
                        "Begich" = 23.6, "Bye" = 1.9)

# Same column layout as the survey rows so the two can be stacked.
additional_mapping <- data.frame(
  name = names(vote_share_mapping),
  percentage = unname(vote_share_mapping),
  dataset = "Actual Election Data"
)

# Append the official results to house_combined_data.
house_combined_data <- bind_rows(house_combined_data, additional_mapping)

# Confidence-interval tables for both survey questions.
combined_table <- rbind(result_table_par, result_table_full)

# Attach the interval bounds to the plotting data. all.x = TRUE keeps the
# official-result rows, which have no interval and therefore get NA bounds.
# merge() also sorts the result by the key columns (name, then dataset), which
# puts "Actual Election Data" first within each candidate.
ci_columns <- c("name", "dataset", "CI_Lower.Freq", "CI_Upper.Freq")
house_combined_data <- merge(house_combined_data,
                             combined_table[, ci_columns],
                             by = c("name", "dataset"),
                             all.x = TRUE)

#########################################################################
# 5. Generate Figure E2
#########################################################################

# Copy each candidate's official vote share onto all of that candidate's rows
# (new_var) so it can be used as the sort key for the x axis.
house_combined_data <- house_combined_data %>%
  mutate(new_var = ifelse(dataset == "Actual Election Data", percentage, NA)) %>%
  group_by(name) %>%
  # "downup" fills in both directions, so the result does not depend on where
  # the "Actual Election Data" row sits within each candidate's group.
  fill(new_var, .direction = "downup") %>%
  ungroup()

# Order candidates by official vote share, lowest first.
house_combined_data$name <- fct_reorder(house_combined_data$name,
                                        house_combined_data$new_var)

# Figure E2: first-choice share per candidate, one dodged point per dataset,
# with error bars drawn from the CI_Lower.Freq / CI_Upper.Freq columns. Rows
# whose bounds are NA (the official results) get a point but no error bar.
figure_e2 <- ggplot(house_combined_data,
                    aes(x = name, y = percentage, fill = dataset,
                        shape = dataset, color = dataset)) +
  geom_point(position = position_dodge(width = 0.5), size = 3) +
  geom_errorbar(aes(ymin = CI_Lower.Freq, ymax = CI_Upper.Freq),
                position = position_dodge(width = 0.5),
                width = 0) +  # width = 0: plain vertical line, no end caps
  labs(title = "US House Election Results - Alaska",
       x = "Candidate Name",
       y = "Vote Share (%)") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 12),
        axis.title = element_text(size = 14),
        plot.title = element_text(size = 16, hjust = 0.5),
        legend.position = "bottom",
        legend.text = element_text(size = 12),
        legend.title = element_blank()) +
  # One shape / colour per dataset; all three scales share the same labels so
  # they are drawn as a single legend.
  scale_shape_manual(values = c(17, 16, 15)) +
  scale_color_manual(values = c("darkblue", "darkred", "darkgreen")) +
  scale_fill_manual(values = c("darkblue", "darkred", "darkgreen"))

# Show the plot when the script is run at top level.
figure_e2

# Save the figure, passing the plot explicitly rather than relying on
# ggsave()'s default of the most recently created plot.
ggsave("FigureE2.pdf", plot = figure_e2, width = 10, height = 6)

#############################################################################
# END OF THIS R SOURCE FILE
#############################################################################

